rm(list = ls())
library(tidyverse)
library(tidymodels)
library(readr)
library(tswge)
library(ggplot2)
library(yardstick)
library(workflows)
library(parsnip)
library(tidyclust)
library(RcppHungarian)
# Compute the sample autocorrelation of `vec` and return it as a data frame
# with columns `lag` and `acf` (lag 0 included, where acf = 1).
acfdf <- function(vec) {
  correlogram <- acf(vec, plot = FALSE)
  with(correlogram, data.frame(lag, acf))
}
# Plot the autocorrelation function of `vec` as vertical segments dropping
# from each acf value to zero, with a horizontal zero reference line.
ggacf <- function(vec) {
  acf_data <- acfdf(vec)
  ggplot(acf_data, aes(x = lag, y = acf)) +
    geom_hline(aes(yintercept = 0)) +
    geom_segment(aes(xend = lag, yend = 0))
}
# Plot `vec` as a time series: value X against its index t = 1..length(vec).
tplot <- function(vec) {
# seq_along gives the (trading-day) index for the x-axis.
df <- data.frame(X = vec, t = seq_along(vec))
ggplot(data = df, aes(x = t, y = X)) + geom_line()
}
Biostat 212a Homework 6
Due Mar 22, 2024 @ 11:59PM
Load R libraries.
1 New York Stock Exchange (NYSE) data (1962-1986) (140 pts)
The NYSE.csv file contains three daily time series from the New York Stock Exchange (NYSE) for the period Dec 3, 1962-Dec 31, 1986 (6,051 trading days).
Log trading volume (\(v_t\)): This is the fraction of all outstanding shares that are traded on that day, relative to a 100-day moving average of past turnover, on the log scale.
Dow Jones return (\(r_t\)): This is the difference between the log of the Dow Jones Industrial Index on consecutive trading days.
Log volatility (\(z_t\)): This is based on the absolute values of daily price movements.
# Read in the NYSE daily series (1962-1986) from the course repository.
# Use `<-` for assignment per tidyverse style.
url <- "https://raw.githubusercontent.com/ucla-biostat-212a/2024winter/master/slides/data/NYSE.csv"
NYSE <- read_csv(url)
NYSE# A tibble: 6,051 × 6
date day_of_week DJ_return log_volume log_volatility train
<date> <chr> <dbl> <dbl> <dbl> <lgl>
1 1962-12-03 mon -0.00446 0.0326 -13.1 TRUE
2 1962-12-04 tues 0.00781 0.346 -11.7 TRUE
3 1962-12-05 wed 0.00384 0.525 -11.7 TRUE
4 1962-12-06 thur -0.00346 0.210 -11.6 TRUE
5 1962-12-07 fri 0.000568 0.0442 -11.7 TRUE
6 1962-12-10 mon -0.0108 0.133 -10.9 TRUE
7 1962-12-11 tues 0.000124 -0.0115 -11.0 TRUE
8 1962-12-12 wed 0.00336 0.00161 -11.0 TRUE
9 1962-12-13 thur -0.00330 -0.106 -11.0 TRUE
10 1962-12-14 fri 0.00447 -0.138 -11.0 TRUE
# ℹ 6,041 more rows
The autocorrelation at lag \(\ell\) is the correlation of all pairs \((v_t, v_{t-\ell})\) that are \(\ell\) trading days apart. These sizable correlations give us confidence that past values will be helpful in predicting the future.
Code
ggacf(NYSE$log_volume) + ggthemes::theme_few()
Do a similar plot for (1) the correlation between \(v_t\) and lag \(\ell\) Dow Jones return \(r_{t-\ell}\) and (2) the correlation between \(v_t\) and lag \(\ell\) log volatility \(z_{t-\ell}\).
Code
# Cross-correlation: cor(v_t, r_{t-l}) for lags l = 1..30; lagging introduces
# NAs at the head of the series, hence use = "pairwise.complete.obs".
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$DJ_return, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `DJ return`")Code
# Same as above, but cross-correlating log_volume with lagged log_volatility.
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$log_volatility, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `log volatility`")1.1 Project goal
Our goal is to forecast daily Log trading volume, using various machine learning algorithms we learnt in this class.
The data set is already split into train (before Jan 1st, 1980, \(n_{\text{train}} = 4,281\)) and test (after Jan 1st, 1980, \(n_{\text{test}} = 1,770\)) sets.
In general, we will tune the lag \(L\) to achieve the best forecasting performance. In this project, we fix \(L=5\). That is, we always use the previous five trading days’ data to forecast today’s log trading volume.
Pay attention to the nuance of splitting time series data for cross validation. Study and use the time-series functionality in tidymodels. Make sure to use the same splits when tuning different machine learning algorithms.
Use the \(R^2\) between forecast and actual values as the cross validation and test evaluation criterion.
1.2 Baseline method (20 pts)
We use the straw man (use yesterday’s value of log trading volume to predict that of today) as the baseline method. Evaluate the \(R^2\) of this method on the test data.
Before we tune different machine learning methods, let’s first separate into the test and non-test sets. We drop the first 5 trading days which lack some lagged variables.
# Create lagged predictors: look back L = 5 trading days. For each lag i,
# add DJ_return_lag<i>, log_volume_lag<i>, log_volatility_lag<i>.
L <- 5
for (i in seq_len(L)) {
  NYSE <- NYSE %>%
    # Refer to columns of the piped data rather than the global `NYSE$...`,
    # so each mutate() is self-contained; `!!name := value` builds the new
    # column with a programmatically constructed name. The lag-source
    # columns are never modified, so the result is the same each iteration.
    mutate(!!paste0("DJ_return_lag", i) := lag(DJ_return, i),
           !!paste0("log_volume_lag", i) := lag(log_volume, i),
           !!paste0("log_volatility_lag", i) := lag(log_volatility, i))
}
# Drop beginning trading days which lack some lagged variables
NYSE <- NYSE %>% na.omit()
# Non-test (training + validation) observations: days before Jan 1, 1980.
# `train` is a logical column, so filter on it directly instead of comparing
# against the string 'TRUE'.
NYSE_other <- NYSE %>%
  filter(train) %>%
  select(-train) %>%
  drop_na()
dim(NYSE_other)[1] 4276 20
# Held-out test set: trading days on/after Jan 1, 1980 (`train` is FALSE).
# Use `<-` for assignment and filter the logical column directly.
NYSE_test <- NYSE %>%
  filter(!train) %>%
  select(-train) %>%
  drop_na()
dim(NYSE_test)[1] 1770 20
library(yardstick)
# cor(NYSE_test$log_volume, NYSE_test$log_volume_lag1) %>% round(2)
# Straw-man baseline: predict today's log_volume with yesterday's value and
# score the test set with R^2 via yardstick::rsq_vec.
r2_test_strawman = rsq_vec(NYSE_test$log_volume, lag(NYSE_test$log_volume, 1)) %>% round(2)
print(paste("Straw man test R2: ", r2_test_strawman))[1] "Straw man test R2: 0.35"
1.3 Autoregression (AR) forecaster (30 pts)
Let \[ y = \begin{pmatrix} v_{L+1} \\ v_{L+2} \\ v_{L+3} \\ \vdots \\ v_T \end{pmatrix}, \quad M = \begin{pmatrix} 1 & v_L & v_{L-1} & \cdots & v_1 \\ 1 & v_{L+1} & v_{L} & \cdots & v_2 \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & v_{T-1} & v_{T-2} & \cdots & v_{T-L} \end{pmatrix}. \]
Fit an ordinary least squares (OLS) regression of \(y\) on \(M\), giving \[ \hat v_t = \hat \beta_0 + \hat \beta_1 v_{t-1} + \hat \beta_2 v_{t-2} + \cdots + \hat \beta_L v_{t-L}, \] known as an order-\(L\) autoregression model or AR(\(L\)).
Before we start the model training, let’s talk about time series resampling. We will use the
`rolling_origin` function in the `rsample` package to create a time series cross-validation plan. When the data have a strong time component, a resampling method should support modeling to estimate seasonal and other temporal trends within the data. A technique that randomly samples values from the training set can disrupt the model’s ability to estimate these patterns.
# Visualize the log_volume trend over time with a linear fit, then build
# `correct_split`: a chronological initial_time_split (default prop = 3/4)
# on the date-ordered non-test data, used later by last_fit().
NYSE %>%
ggplot(aes(x = date, y = log_volume)) +
geom_line() +
geom_smooth(method = "lm")correct_split <- initial_time_split(NYSE_other %>% arrange(date))
bind_rows(
training(correct_split) %>% mutate(type = "train"),
testing(correct_split) %>% mutate(type = "test")
) %>%
ggplot(aes(x = date, y = log_volume, color = type, group = NA)) +
geom_line()rolling_origin(NYSE_other %>% arrange(date), initial = 30, assess = 7) %>%
#sliding_period(NYSE_other %>% arrange(date), date, period = "day", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice0001", "Slice0002", "Slice0003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")sliding_period(NYSE_other %>% arrange(date),
date, period = "month", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice001", "Slice002", "Slice003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")Rolling forecast origin resampling (Hyndman and Athanasopoulos 2018) provides a method that emulates how time series data is often partitioned in practice, estimating the model with historical data and evaluating it with the most recent data.
Tune AR(5) with elastic net (lasso + ridge) regularization using all 3 features on the training data, and evaluate the test performance.
1.3.1 Recipe
# Elastic-net recipe: dummy-encode nominal predictors (day_of_week),
# normalize numeric predictors, drop rows with missing predictors.
# NOTE(review): `prep(data = ...)` passes `data` into `...`; prep()'s formal
# argument is `training`. The workflow re-preps the recipe anyway, but
# confirm the argument name is intended.
en_recipe <-
recipe(log_volume ~ ., data = NYSE_other) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_naomit(all_predictors()) %>%
prep(data = NYSE_other)1.3.2 Model
### Model
enet_mod <-
# mixture = 0 (ridge), mixture = 1 (lasso)
# mixture = (0, 1) elastic net
# As an example, we set mixture = 0.5. It needs to be tuned.
linear_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet")
enet_modLinear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = tune()
Computational engine: glmnet
1.3.3 Workflow
# Workflow: elastic-net model + recipe, additionally removing the raw `date`
# column and adding missing-value indicator columns before fitting.
en_wf <-
workflow() %>%
add_model(enet_mod) %>%
add_recipe(en_recipe %>% step_rm(date) %>% step_indicate_na())
en_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = tune()
Computational engine: glmnet
1.3.4 Tuning grid
# Time-aware resamples: expanding-window monthly splits (train on all prior
# months, assess the following month).
# NOTE(review): this `folds` object is never used for elastic-net tuning
# (tune_grid below uses `month_folds`) and is later overwritten by
# vfold_cv() — consider removing it.
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
# Monthly expanding-window folds used for CV; skip = 4 thins the slices.
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)lambda_grid <- grid_regular(
penalty(range = c(-8, -4), trans = log10_trans()),
mixture(range = c(0, 1)),
levels = 3
)
lambda_grid# A tibble: 9 × 2
penalty mixture
<dbl> <dbl>
1 0.00000001 0
2 0.000001 0
3 0.0001 0
4 0.00000001 0.5
5 0.000001 0.5
6 0.0001 0.5
7 0.00000001 1
8 0.000001 1
9 0.0001 1
1.3.5 CV
# Tune the elastic net over the penalty/mixture grid on the monthly
# expanding-window resamples; collect_metrics() averages across slices.
en_fit <- tune_grid(en_wf, resamples = month_folds, grid = lambda_grid) %>%
collect_metrics()
en_fit# A tibble: 18 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mode…
2 0.00000001 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mode…
3 0.000001 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mode…
4 0.000001 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mode…
5 0.0001 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mode…
6 0.0001 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mode…
7 0.00000001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
8 0.00000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mode…
9 0.000001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
10 0.000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mode…
11 0.0001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
12 0.0001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mode…
13 0.00000001 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
14 0.00000001 1 rsq standard 0.381 200 0.0149 Preprocessor1_Mode…
15 0.000001 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
16 0.000001 1 rsq standard 0.381 200 0.0149 Preprocessor1_Mode…
17 0.0001 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mode…
18 0.0001 1 rsq standard 0.380 200 0.0149 Preprocessor1_Mode…
Visualize CV criterion:
en_fit %>%
filter(.metric == "rsq") %>%
ggplot(mapping = aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
labs(x = "Penalty", y = "CV RSQ") +
scale_x_log10(labels = scales::label_number())Select the best model:
best_en <- en_fit %>% filter(.metric == "rsq") %>% top_n(1, wt = mean)
best_en# A tibble: 2 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Model4
2 0.000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Model5
1.3.6 Finalize the model
# Final workflow
final_wf <- en_wf %>%
finalize_workflow(best_en[1, ])
final_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = 1e-08
mixture = 0.5
Computational engine: glmnet
# Fit the whole training set, then predict the test cases
# NOTE(review): last_fit(correct_split) evaluates on the holdout tail of
# NYSE_other (the initial_time_split's 25%), not on NYSE_test (1980-1986).
# Verify the reported "test" R^2 is computed on the intended data.
final_fit <-
final_wf %>%
last_fit(correct_split)
final_fit# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit %>% collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.138 Preprocessor1_Model1
2 rsq standard 0.688 Preprocessor1_Model1
# en_pred <- predict(final_fit, new_data = NYSE_test)$.pred
# en_pred1.4 Random forest forecaster (30pts)
1.4.1 Recipe
# Random-forest recipe: trees handle factors and unscaled features natively,
# so only drop rows with a missing outcome and remove zero-variance columns.
rf_recipe <-
recipe(log_volume ~ ., data = NYSE_other) %>%
# step_dummy(all_nominal(), -all_outcomes()) %>%
# step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_naomit(log_volume) %>%
step_zv(all_numeric_predictors())1.4.2 Model
# Random forest (ranger engine); tune mtry (candidate predictors per split)
# and the number of trees.
rf_mod <-
rand_forest(
mode = "regression",
mtry = tune(),
trees = tune()
) %>%
set_engine("ranger")
rf_modRandom Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
1.4.3 Workflow
# Workflow: random-forest model + recipe.
# NOTE(review): unlike the elastic-net workflow, `date` is not removed here,
# so it enters the forest as a predictor — confirm this is intended.
rf_wf <-
workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_mod)
rf_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_naomit()
• step_zv()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
1.4.4 Tuning grid
# Tuning grid: mtry in {1, 3, 5} crossed with trees in {300, 450, ..., 900}
# (3 x 5 = 15 combinations).
rf_param_grid <-
grid_regular(
mtry(range = c(1L, 5L)),
trees(range = c(300L, 900L)),
levels = c(3, 5)
)
rf_param_grid# A tibble: 15 × 2
mtry trees
<int> <int>
1 1 300
2 3 300
3 5 300
4 1 450
5 3 450
6 5 450
7 1 600
8 3 600
9 5 600
10 1 750
11 3 750
12 5 750
13 1 900
14 3 900
15 5 900
1.4.5 CV
set.seed(203)
# NOTE(review): vfold_cv shuffles trading days randomly across folds, which
# leaks future observations into training for time-series data. The earlier
# discussion argues for rolling/sliding-window resamples (e.g. month_folds)
# instead — confirm random CV is intended for the tree-based models.
folds <- vfold_cv(NYSE_other, v = 5)
folds# 5-fold cross-validation
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [3420/856]> Fold1
2 <split [3421/855]> Fold2
3 <split [3421/855]> Fold3
4 <split [3421/855]> Fold4
5 <split [3421/855]> Fold5
# Tune the forest over the 15-point grid on the 5-fold resamples, tracking
# RMSE and R^2.
rf_fit <- rf_wf %>%
tune_grid(
resamples = folds,
grid = rf_param_grid,
metrics = metric_set(rmse, rsq)
)
rf_fit# Tuning results
# 5-fold cross-validation
# A tibble: 5 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [3420/856]> Fold1 <tibble [30 × 6]> <tibble [0 × 3]>
2 <split [3421/855]> Fold2 <tibble [30 × 6]> <tibble [0 × 3]>
3 <split [3421/855]> Fold3 <tibble [30 × 6]> <tibble [0 × 3]>
4 <split [3421/855]> Fold4 <tibble [30 × 6]> <tibble [0 × 3]>
5 <split [3421/855]> Fold5 <tibble [30 × 6]> <tibble [0 × 3]>
Visualize CV results:
rf_fit %>%
collect_metrics() %>%
print(width = Inf) %>%
filter(.metric == "rsq") %>%
mutate(mtry = as.factor(mtry)) %>%
ggplot(mapping = aes(x = trees, y = mean, color = mtry)) +
# geom_point() +
geom_line() +
labs(x = "Num. of Trees", y = "CV rsq")# A tibble: 30 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 300 rmse standard 0.150 5 0.00346 Preprocessor1_Model01
2 1 300 rsq standard 0.617 5 0.0157 Preprocessor1_Model01
3 3 300 rmse standard 0.140 5 0.00358 Preprocessor1_Model02
4 3 300 rsq standard 0.642 5 0.0138 Preprocessor1_Model02
5 5 300 rmse standard 0.138 5 0.00368 Preprocessor1_Model03
6 5 300 rsq standard 0.649 5 0.0144 Preprocessor1_Model03
7 1 450 rmse standard 0.150 5 0.00362 Preprocessor1_Model04
8 1 450 rsq standard 0.619 5 0.0162 Preprocessor1_Model04
9 3 450 rmse standard 0.140 5 0.00371 Preprocessor1_Model05
10 3 450 rsq standard 0.644 5 0.0141 Preprocessor1_Model05
# ℹ 20 more rows
Show the top 5 models:
rf_fit %>%
show_best("rsq")# A tibble: 5 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 300 rsq standard 0.649 5 0.0144 Preprocessor1_Model03
2 5 750 rsq standard 0.649 5 0.0131 Preprocessor1_Model12
3 5 450 rsq standard 0.648 5 0.0135 Preprocessor1_Model06
4 5 900 rsq standard 0.648 5 0.0138 Preprocessor1_Model15
5 5 600 rsq standard 0.648 5 0.0136 Preprocessor1_Model09
Select the best:
best_rf <- rf_fit %>%
select_best("rsq")
best_rf# A tibble: 1 × 3
mtry trees .config
<int> <int> <chr>
1 5 300 Preprocessor1_Model03
1.4.6 Finalize the model
# Final workflow
final_wf <- rf_wf %>%
finalize_workflow(best_rf)
final_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_naomit()
• step_zv()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = 5
trees = 300
Computational engine: ranger
# # Fit the whole training set, then predict the test cases
# train_split <- new_rset(NYSE_other, ids = "train")
# test_split <- new_rset(NYSE_test, ids = "test")
#
# # Combine the rsplit objects into a manual_rset object
# data_split <- manual_rset(list(train = train_split, test = test_split), ids = c("train", "test"))
# Refit the finalized forest on the training portion of correct_split and
# score the chronological holdout (the tail of NYSE_other, not NYSE_test).
final_fit <-
final_wf %>%
last_fit(correct_split)
final_fit# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit %>% collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.154 Preprocessor1_Model1
2 rsq standard 0.653 Preprocessor1_Model1
1.5 Boosting forecaster (30pts)
1.5.1 Recipe
# Boosting recipe: dummy-encode nominal predictors, drop rows with a missing
# outcome, remove zero-variance predictors, and normalize; prep with
# retain = TRUE so the processed training data is kept.
gb_recipe <-
recipe(
log_volume ~ .,
data = NYSE_other
) %>%
step_dummy(all_nominal()) %>%
step_naomit(log_volume) %>%
step_zv(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
prep(data = NYSE_other, retain = TRUE)
gb_recipe1.5.2 Model
gb_recipe1.5.2 Model
# Gradient boosting (xgboost engine): 1000 trees fixed; tune tree depth and
# learning rate.
gb_mod <-
boost_tree(
mode = "regression",
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost")
gb_modBoosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
1.5.3 Workflow
# Workflow: boosting model + recipe, dropping the raw `date` column and
# adding missing-value indicator columns (mirrors the elastic-net workflow).
gb_wf <-
workflow() %>%
add_model(gb_mod) %>%
add_recipe(gb_recipe %>% step_rm(date) %>%
step_indicate_na())
gb_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
6 Recipe Steps
• step_dummy()
• step_naomit()
• step_zv()
• step_normalize()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
1.5.4 Tuning grid
# Tuning grid: tree_depth 1-4 crossed with 10 log-spaced learning rates in
# [10^-3, 10^-0.5] (40 combinations).
param_grid <- grid_regular(
tree_depth(range = c(1L, 4L)),
learn_rate(range = c(-3, -0.5), trans = log10_trans()),
levels = c(4, 10)
)
param_grid# A tibble: 40 × 2
tree_depth learn_rate
<int> <dbl>
1 1 0.001
2 2 0.001
3 3 0.001
4 4 0.001
5 1 0.00190
6 2 0.00190
7 3 0.00190
8 4 0.00190
9 1 0.00359
10 2 0.00359
# ℹ 30 more rows
1.5.5 CV
# Tune boosting on the same resamples used for the forest.
# NOTE(review): `folds` is the random vfold_cv object — random folds leak
# future information for time-series data; see the resampling discussion.
gb_fit <- gb_wf %>%
tune_grid(
resamples = folds,
grid = param_grid,
metrics = metric_set(rmse, rsq)
)
gb_fit# Tuning results
# 5-fold cross-validation
# A tibble: 5 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [3420/856]> Fold1 <tibble [80 × 6]> <tibble [0 × 3]>
2 <split [3421/855]> Fold2 <tibble [80 × 6]> <tibble [0 × 3]>
3 <split [3421/855]> Fold3 <tibble [80 × 6]> <tibble [0 × 3]>
4 <split [3421/855]> Fold4 <tibble [80 × 6]> <tibble [0 × 3]>
5 <split [3421/855]> Fold5 <tibble [80 × 6]> <tibble [0 × 3]>
Visualize CV criterion:
gb_fit %>%
collect_metrics() %>%
print(width = Inf) %>%
filter(.metric == "rsq") %>%
ggplot(mapping = aes(x = learn_rate, y = mean, color = factor(tree_depth))) +
geom_point() +
geom_line() +
labs(x = "Learning Rate", y = "RSQ") +
scale_x_log10()# A tibble: 80 × 8
tree_depth learn_rate .metric .estimator mean n std_err
<int> <dbl> <chr> <chr> <dbl> <int> <dbl>
1 1 0.001 rmse standard 0.266 5 0.00357
2 1 0.001 rsq standard 0.465 5 0.0167
3 2 0.001 rmse standard 0.257 5 0.00354
4 2 0.001 rsq standard 0.530 5 0.0161
5 3 0.001 rmse standard 0.253 5 0.00357
6 3 0.001 rsq standard 0.564 5 0.0162
7 4 0.001 rmse standard 0.251 5 0.00345
8 4 0.001 rsq standard 0.590 5 0.0153
9 1 0.00190 rmse standard 0.190 5 0.00334
10 1 0.00190 rsq standard 0.509 5 0.0158
.config
<chr>
1 Preprocessor1_Model01
2 Preprocessor1_Model01
3 Preprocessor1_Model02
4 Preprocessor1_Model02
5 Preprocessor1_Model03
6 Preprocessor1_Model03
7 Preprocessor1_Model04
8 Preprocessor1_Model04
9 Preprocessor1_Model05
10 Preprocessor1_Model05
# ℹ 70 more rows
Show the top 5 models:
gb_fit %>%
show_best("rsq")# A tibble: 5 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 4 0.0129 rsq standard 0.668 5 0.0122 Preprocessor1_Mo…
2 2 0.0464 rsq standard 0.667 5 0.0112 Preprocessor1_Mo…
3 3 0.0245 rsq standard 0.667 5 0.0119 Preprocessor1_Mo…
4 2 0.0245 rsq standard 0.666 5 0.0110 Preprocessor1_Mo…
5 3 0.0129 rsq standard 0.665 5 0.0114 Preprocessor1_Mo…
Select the best:
best_gb <- gb_fit %>%
select_best("rsq")
best_gb# A tibble: 1 × 3
tree_depth learn_rate .config
<int> <dbl> <chr>
1 4 0.0129 Preprocessor1_Model20
1.5.6 Finalize the model
# Final workflow
final_wf <- gb_wf %>%
finalize_workflow(best_gb)
final_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
6 Recipe Steps
• step_dummy()
• step_naomit()
• step_zv()
• step_normalize()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = 4
learn_rate = 0.0129154966501488
Computational engine: xgboost
# Fit the whole training set, then predict the test cases
# (last_fit scores the chronological holdout of correct_split, i.e. the tail
# of NYSE_other — not NYSE_test.)
final_fit <-
final_wf %>%
last_fit(correct_split)
final_fit# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit %>% collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.141 Preprocessor1_Model1
2 rsq standard 0.682 Preprocessor1_Model1
1.6 Summary (30pts)
Your score for this question is largely determined by your final test performance.
Summarize the performance of different machine learning forecasters in the following format.
| Method | CV \(R^2\) | Test \(R^2\) | |
|---|---|---|---|
| Baseline | 0.35 | ||
| AR(5) | 0.3807083 | 0.6883465 | |
| Random Forest | 0.6482922 | 0.6545564 | |
| Boosting | 0.6676416 | 0.6820797 |
1.7 Extension reading
2 ISL Exercise 12.6.13 (90 pts)
Ch12Ex13 <- read_csv("~/212A/slides/data/Ch12Ex13.csv", col_names = paste("ID", 1:40, sep = ""))# hc.complete <- hclust(as.dist(1 - cor(Ch12Ex13)), method = "complete")
# plot(hc.complete)
#
# hc.average <- hclust(as.dist(1 - cor(Ch12Ex13)), method = "average")
# plot(hc.average)
#
# hc.single <- hclust(as.dist(1 - cor(Ch12Ex13)), method = "single")
# plot(hc.single)
#
# hc.centroid <- hclust(as.dist(1 - cor(Ch12Ex13)), method = "centroid")
# plot(hc.centroid)2.1 12.6.13 (b) (30 pts)
The samples are consistently separated into two groups regardless of the linkage method used, although the specific results vary depending on the chosen linkage method.
# Hierarchical clustering of the 40 samples with single linkage; transpose
# so rows = samples and columns = genes.
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "single"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()set.seed(838383)
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "average"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "complete"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "centroid"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()2.2 PCA and UMAP (30 pts)
library(tidyverse)
library(tidymodels)- PCA
# Transpose so rows = 40 samples, columns = 1000 genes, and attach a group
# label: first 20 samples "healthy", last 20 "diseased".
# NOTE(review): this labels samples 1-20 as healthy, while the heatmap
# annotation (`dfh`) later labels IDs 1-20 as disease — confirm which
# orientation matches the data.
transposed_gene <- as_tibble(t(Ch12Ex13)) |>
mutate(group = rep(c("healthy", "diseased"), each = 20))Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
# PCA recipe: keep `group` as an id (not a predictor), normalize the gene
# columns, then compute principal components (step_pca default number).
pca_rec <- recipe(~., data = transposed_gene) |>
update_role(group, new_role = "id") |>
step_normalize(all_predictors()) |>
step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_preplibrary(tidytext)
tidied_pca <- tidy(pca_prep, 2)
tidied_pca |>
filter(component %in% paste0("PC", 1:4)) |>
group_by(component) |>
top_n(8, abs(value)) |>
ungroup() |>
mutate(terms = reorder_within(terms, abs(value), component)) |>
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)juice(pca_prep) %>%
ggplot(aes(PC1, PC2)) +
geom_point(aes(color = group), alpha = 0.7, size = 2) +
#geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)It seems that the first two principal components separate the samples into the two groups.
- UMAP
library(embed)
# UMAP recipe: same preprocessing as PCA, but embed with step_umap
# (from the embed package).
umap_rec <- recipe(~., data = transposed_gene) |>
update_role(group, new_role = "id") |>
step_normalize(all_predictors()) |>
step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prepjuice(umap_prep) %>%
ggplot(aes(UMAP1, UMAP2)) +
geom_point(aes(color = group), alpha = 0.7, size = 2) +
# geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)2.3 12.6.13 (c) (30 pts)
# Group labels for the 40 samples: first 20 coded 1, last 20 coded 0.
grp <- factor(rep(c(1, 0), each = 20))

# Two-sample comparison via OLS: regress one gene's expression vector `y`
# on `grp` and return the p-value of the group coefficient.
# (Renamed the local from `sum` to avoid shadowing base::sum; the last
# expression is returned, so no explicit return() is needed.)
regression <- function(y) {
  fit_summary <- summary(lm(y ~ grp))
  fit_summary$coefficients[2, 4]
}
# Per-gene p-values: apply the group regression to each of the 1000 gene
# rows and collect (gene index, p-value) pairs.
out <- tibble(gene = seq(1, nrow(Ch12Ex13)),
p_values = unlist(purrr:: map(1:nrow(Ch12Ex13), ~regression(as.matrix(Ch12Ex13)[.x, ]))))- These genes differ the most across the two groups(p < 0.05):
out %>% arrange(p_values) %>% head(10)# A tibble: 10 × 2
gene p_values
<int> <dbl>
1 502 1.43e-12
2 589 3.19e-12
3 600 9.81e-12
4 590 6.28e-11
5 565 1.74e-10
6 551 1.10e- 9
7 593 1.19e- 9
8 584 1.65e- 9
9 538 2.42e- 9
10 569 2.69e- 9
# sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05/nrow(Ch12Ex13))
# Genes with raw p < 0.05, sorted by significance. NOTE(review): the
# Bonferroni-corrected threshold above is commented out — with 1000 tests,
# raw p < 0.05 admits many false positives; confirm which cutoff is wanted.
sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05 )# install.packages("pheatmap")
library(pheatmap)
# install.packages("ggplotify")
library(ggplotify) ## to convert pheatmap to ggplot2
# install.packages("heatmaply")
library(heatmaply) ## for constructing interactive heatmap#create data frame for annotations
# Heatmap column annotation: disease status per sample ID (rownames = IDs).
# NOTE(review): IDs 1-20 are labeled "disease" and 21-40 "healthy" here,
# the opposite coding of `transposed_gene`'s healthy/diseased labels —
# confirm which orientation matches the data.
dfh <- data.frame(sample=as.character(colnames(Ch12Ex13)), status = "disease") %>%
column_to_rownames("sample")
dfh$status[seq(21, 40)] <- "healthy"
dfh status
ID1 disease
ID2 disease
ID3 disease
ID4 disease
ID5 disease
ID6 disease
ID7 disease
ID8 disease
ID9 disease
ID10 disease
ID11 disease
ID12 disease
ID13 disease
ID14 disease
ID15 disease
ID16 disease
ID17 disease
ID18 disease
ID19 disease
ID20 disease
ID21 healthy
ID22 healthy
ID23 healthy
ID24 healthy
ID25 healthy
ID26 healthy
ID27 healthy
ID28 healthy
ID29 healthy
ID30 healthy
ID31 healthy
ID32 healthy
ID33 healthy
ID34 healthy
ID35 healthy
ID36 healthy
ID37 healthy
ID38 healthy
ID39 healthy
ID40 healthy
# Heatmap of the significant genes: rows (genes) scaled and kept in p-value
# order (cluster_rows = FALSE), columns (samples) clustered and annotated
# by disease status.
pheatmap(Ch12Ex13[sig$gene, ], cluster_rows = FALSE, cluster_cols = T, scale="row", annotation_col = dfh,
annotation_colors=list(status = c(disease = "orange", healthy = "black")),
color=colorRampPalette(c("navy", "white", "red"))(50))